library(ggplot2)
library(GGally)
library(gridExtra)
## Loading required package: grid
library(psych)
##
## Attaching package: 'psych'
##
## The following object is masked from 'package:ggplot2':
##
## %+%
library(dplyr)
##
## Attaching package: 'dplyr'
##
## The following object is masked from 'package:GGally':
##
## nasa
##
## The following objects are masked from 'package:stats':
##
## filter, lag
##
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(scales)
##
## Attaching package: 'scales'
##
## The following objects are masked from 'package:psych':
##
## alpha, rescale
library(memisc)
## Loading required package: lattice
## Loading required package: MASS
##
## Attaching package: 'MASS'
##
## The following object is masked from 'package:dplyr':
##
## select
##
##
## Attaching package: 'memisc'
##
## The following object is masked from 'package:scales':
##
## percent
##
## The following objects are masked from 'package:dplyr':
##
## collect, query, rename
##
## The following objects are masked from 'package:stats':
##
## contr.sum, contr.treatment, contrasts
##
## The following objects are masked from 'package:base':
##
## as.array, trimws
# Use a minimal ggplot2 theme with a 20pt base size for every plot below.
theme_set(theme_minimal(20))
# Read the red-wine data through an explicit path instead of calling setwd(),
# so loading the file does not change the session's working directory.
wdf <- read.csv(file.path('~/repos/nanodgreep3', 'wineQualityReds.csv'))
wdf <- wdf[, -1] # Get rid of unused column : 'X'
# Row and column counts after dropping the first column.
dim(wdf)
## [1] 1599 12
# Names of the remaining variables.
names(wdf)
## [1] "fixed.acidity" "volatile.acidity" "citric.acid"
## [4] "residual.sugar" "chlorides" "free.sulfur.dioxide"
## [7] "total.sulfur.dioxide" "density" "pH"
## [10] "sulphates" "alcohol" "quality"
# Type and first few values of every column.
str(wdf)
## 'data.frame': 1599 obs. of 12 variables:
## $ fixed.acidity : num 7.4 7.8 7.8 11.2 7.4 7.4 7.9 7.3 7.8 7.5 ...
## $ volatile.acidity : num 0.7 0.88 0.76 0.28 0.7 0.66 0.6 0.65 0.58 0.5 ...
## $ citric.acid : num 0 0 0.04 0.56 0 0 0.06 0 0.02 0.36 ...
## $ residual.sugar : num 1.9 2.6 2.3 1.9 1.9 1.8 1.6 1.2 2 6.1 ...
## $ chlorides : num 0.076 0.098 0.092 0.075 0.076 0.075 0.069 0.065 0.073 0.071 ...
## $ free.sulfur.dioxide : num 11 25 15 17 11 13 15 15 9 17 ...
## $ total.sulfur.dioxide: num 34 67 54 60 34 40 59 21 18 102 ...
## $ density : num 0.998 0.997 0.997 0.998 0.998 ...
## $ pH : num 3.51 3.2 3.26 3.16 3.51 3.51 3.3 3.39 3.36 3.35 ...
## $ sulphates : num 0.56 0.68 0.65 0.58 0.56 0.56 0.46 0.47 0.57 0.8 ...
## $ alcohol : num 9.4 9.8 9.8 9.8 9.4 9.4 9.4 10 9.5 10.5 ...
## $ quality : int 5 5 5 6 5 5 5 7 7 5 ...
# Distinct quality scores present in the data, shown as factor levels.
levels(factor(wdf$quality))
## [1] "3" "4" "5" "6" "7" "8"
# Per-column summary statistics (min, quartiles, mean, max).
summary(wdf)
## fixed.acidity volatile.acidity citric.acid residual.sugar
## Min. : 4.60 Min. :0.1200 Min. :0.000 Min. : 0.900
## 1st Qu.: 7.10 1st Qu.:0.3900 1st Qu.:0.090 1st Qu.: 1.900
## Median : 7.90 Median :0.5200 Median :0.260 Median : 2.200
## Mean : 8.32 Mean :0.5278 Mean :0.271 Mean : 2.539
## 3rd Qu.: 9.20 3rd Qu.:0.6400 3rd Qu.:0.420 3rd Qu.: 2.600
## Max. :15.90 Max. :1.5800 Max. :1.000 Max. :15.500
## chlorides free.sulfur.dioxide total.sulfur.dioxide
## Min. :0.01200 Min. : 1.00 Min. : 6.00
## 1st Qu.:0.07000 1st Qu.: 7.00 1st Qu.: 22.00
## Median :0.07900 Median :14.00 Median : 38.00
## Mean :0.08747 Mean :15.87 Mean : 46.47
## 3rd Qu.:0.09000 3rd Qu.:21.00 3rd Qu.: 62.00
## Max. :0.61100 Max. :72.00 Max. :289.00
## density pH sulphates alcohol
## Min. :0.9901 Min. :2.740 Min. :0.3300 Min. : 8.40
## 1st Qu.:0.9956 1st Qu.:3.210 1st Qu.:0.5500 1st Qu.: 9.50
## Median :0.9968 Median :3.310 Median :0.6200 Median :10.20
## Mean :0.9967 Mean :3.311 Mean :0.6581 Mean :10.42
## 3rd Qu.:0.9978 3rd Qu.:3.400 3rd Qu.:0.7300 3rd Qu.:11.10
## Max. :1.0037 Max. :4.010 Max. :2.0000 Max. :14.90
## quality
## Min. :3.000
## 1st Qu.:5.000
## Median :6.000
## Mean :5.636
## 3rd Qu.:6.000
## Max. :8.000
The average quality is 5.6 and the median quality is 6.0. About 75% of wines have a quality score of 6 or lower. About 75% of wines have fixed acidity below 10.0. About 75% of wines have residual sugar of at most 2.6, but its maximum value, 15.5, is very high (i.e. very sweet). All wines have similar density, from 0.99 to 1.00.
# Histogram of the quality scores, using the default binning.
ggplot(wdf, aes(x = quality)) +
  geom_histogram()
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
Quality takes only integer values, and most wines score 5 or 6. The scores can be grouped into three grades.
# Add a simplified categorical variable for the wine quality:
# NG = quality 3-4, EX = quality 7-8, GD = every other score (5-6).
wdf$taste <- factor(
  ifelse(wdf$quality %in% c(3, 4), "NG",
         ifelse(wdf$quality %in% c(7, 8), "EX", "GD")),
  levels = c("NG", "GD", "EX")
)
# Add a categorical copy of the raw quality score (levels "3" to "8").
wdf$taste.detail <- factor(wdf$quality, levels = 3:8)
# Bar chart and counts of the three taste grades.
ggplot(wdf, aes(x = taste)) +
  geom_bar()
summary(wdf[["taste"]])
## NG GD EX
## 63 1319 217
str(wdf[["taste"]])
## Factor w/ 3 levels "NG","GD","EX": 2 2 2 2 2 2 2 3 3 2 ...
# Share of wines in each taste grade. The bar heights are fractions of the
# total count, so the y axis is formatted as percentages to match its label.
# scales:: is spelled out because memisc masks scales::percent above.
ggplot(wdf, aes(x = taste)) +
  geom_bar(aes(y = (..count..)/sum(..count..))) +
  scale_y_continuous(labels = scales::percent_format()) +
  ylab('Percentage')
# Fixed acidity: summary statistics, then a histogram with 0.25-wide bins.
summary(wdf[["fixed.acidity"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 4.60 7.10 7.90 8.32 9.20 15.90
ggplot(wdf, aes(x = fixed.acidity)) +
  geom_histogram(binwidth = 0.25)
Most wines have a fixed.acidity between 7 and 14.
# Volatile acidity: summary statistics, a default-binned histogram, and a
# 0.01-bin histogram zoomed to the 0.1-1.0 range.
summary(wdf[["volatile.acidity"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.1200 0.3900 0.5200 0.5278 0.6400 1.5800
ggplot(wdf, aes(x = volatile.acidity)) +
  geom_histogram()
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
ggplot(wdf, aes(x = volatile.acidity)) +
  geom_histogram(binwidth = 0.01) +
  coord_cartesian(xlim = c(0.1, 1.0))
Again, most of wines have a volatile.acidity between 0.2 and 1.0.
# Citric acid: summary statistics, default and 0.01-bin histograms, and a
# count of how many wines have exactly zero citric acid.
summary(wdf[["citric.acid"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 0.090 0.260 0.271 0.420 1.000
ggplot(wdf, aes(x = citric.acid)) +
  geom_histogram()
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
ggplot(wdf, aes(x = citric.acid)) +
  geom_histogram(binwidth = 0.01)
table(wdf[["citric.acid"]] == 0)
##
## FALSE TRUE
## 1467 132
About 8% of wines (132 of 1,599) have no citric.acid.
# Residual sugar: summary statistics, a default-binned histogram, a 0.1-bin
# histogram zoomed on x, and the same histogram with a log10 count axis.
summary(wdf[["residual.sugar"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.900 1.900 2.200 2.539 2.600 15.500
ggplot(wdf, aes(x = residual.sugar)) +
  geom_histogram()
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
ggplot(wdf, aes(x = residual.sugar)) +
  geom_histogram(binwidth = 0.1) +
  coord_cartesian(xlim = c(0.9, 12.0))
# NOTE(review): confirm whether ylim is interpreted on the log10-transformed
# scale here; if so, c(1, 150) does not mean counts 1 to 150.
ggplot(wdf, aes(x = residual.sugar)) +
  geom_histogram(binwidth = 0.1) +
  coord_cartesian(xlim = c(0.9, 12.0), ylim = c(1, 150)) +
  scale_y_log10()
## Warning: Stacking not well defined when ymin != 0
Residual sugar of most wines varies from 1.0 to 7.0.
# Chlorides: summary statistics, a default-binned histogram, a 0.001-bin
# histogram zoomed on x, and the same histogram with a log10 count axis.
summary(wdf[["chlorides"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.01200 0.07000 0.07900 0.08747 0.09000 0.61100
ggplot(wdf, aes(x = chlorides)) +
  geom_histogram()
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
ggplot(wdf, aes(x = chlorides)) +
  geom_histogram(binwidth = 0.001) +
  coord_cartesian(xlim = c(0.05, 0.2))
# NOTE(review): confirm whether ylim is interpreted on the log10-transformed
# scale here; if so, c(1, 100) does not mean counts 1 to 100.
ggplot(wdf, aes(x = chlorides)) +
  geom_histogram(binwidth = 0.001) +
  coord_cartesian(xlim = c(0.05, 0.2), ylim = c(1, 100)) +
  scale_y_log10()
## Warning: Stacking not well defined when ymin != 0
Most wines have a chlorides ranging from 0.05 to 0.2.
# Free sulfur dioxide: summary statistics, a default-binned histogram, and a
# unit-bin histogram zoomed to 0-60.
summary(wdf[["free.sulfur.dioxide"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.00 7.00 14.00 15.87 21.00 72.00
ggplot(wdf, aes(x = free.sulfur.dioxide)) +
  geom_histogram()
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
ggplot(wdf, aes(x = free.sulfur.dioxide)) +
  geom_histogram(binwidth = 1) +
  coord_cartesian(xlim = c(0, 60))
Most wines have free.sulfur.dioxide under 40.
# Total sulfur dioxide: summary statistics, a default-binned histogram, and a
# unit-bin histogram zoomed to 5-160.
summary(wdf[["total.sulfur.dioxide"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 6.00 22.00 38.00 46.47 62.00 289.00
ggplot(wdf, aes(x = total.sulfur.dioxide)) +
  geom_histogram()
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
ggplot(wdf, aes(x = total.sulfur.dioxide)) +
  geom_histogram(binwidth = 1) +
  coord_cartesian(xlim = c(5, 160))
Again, most wines have total.sulfur.dioxide between 5 and 160.
# Density: summary statistics, default and 0.0001-bin histograms, and the
# sample variance.
summary(wdf[["density"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.9901 0.9956 0.9968 0.9967 0.9978 1.0040
ggplot(wdf, aes(x = density)) +
  geom_histogram()
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## Warning: position_stack requires constant width: output may be incorrect
ggplot(wdf, aes(x = density)) +
  geom_histogram(binwidth = 0.0001)
## Warning: position_stack requires constant width: output may be incorrect
var(wdf[["density"]])
## [1] 3.562029e-06
Most wines have almost the same density: its variance is only about 3.6e-06.
# pH: summary statistics, then default and 0.01-bin histograms.
summary(wdf[["pH"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2.740 3.210 3.310 3.311 3.400 4.010
ggplot(wdf, aes(x = pH)) +
  geom_histogram()
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
ggplot(wdf, aes(x = pH)) +
  geom_histogram(binwidth = 0.01)
Of course, all wines are acidic (i.e. below pH 7): every wine has a pH between 2.74 and 4.01.
# Sulphates: summary statistics, a default-binned histogram, and a 0.01-bin
# histogram zoomed to 0.3-1.5.
summary(wdf[["sulphates"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.3300 0.5500 0.6200 0.6581 0.7300 2.0000
ggplot(wdf, aes(x = sulphates)) +
  geom_histogram()
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
ggplot(wdf, aes(x = sulphates)) +
  geom_histogram(binwidth = 0.01) +
  coord_cartesian(xlim = c(0.3, 1.5))
Also, most wines have sulphates between 0.3 and 1.2.
The main features in this data are alcohol, residual.sugar, and quality (taste). I want to find out which of alcohol and residual.sugar better explains a wine’s flavor.
Some people enjoy citric.acid flavor, or density. Therefore, two features can be one of factors for good taste.
I created a variable for ‘taste’ using quality variable. Because quality has only integer variable, so I think converting it to a categorical variable is good idea. In addition, I simplified 6 steps(3~8) to 3 steps(NG, GD, EX).
The quality variable is stored as an integer score, but there is no need to treat it as a number. So I decided to convert it to a factor with 3 levels (NG, GD, EX).
## citric.acid residual.sugar density alcohol
## citric.acid 1.0000000 0.14357716 0.3649472 0.10990325
## residual.sugar 0.1435772 1.00000000 0.3552834 0.04207544
## density 0.3649472 0.35528337 1.0000000 -0.49617977
## alcohol 0.1099032 0.04207544 -0.4961798 1.00000000
## quality 0.2263725 0.01373164 -0.1749192 0.47616632
## quality
## citric.acid 0.22637251
## residual.sugar 0.01373164
## density -0.17491923
## alcohol 0.47616632
## quality 1.00000000
# Scatterplot matrix restricted to the variables of interest; plotting every
# pair of columns would be needlessly expensive.
pairs.panels(
  wdf[, c("citric.acid", "residual.sugar", "density", "alcohol", "quality")],
  pch = "."
)
The alcohol feature has the strongest relationship with quality. The correlation is far from perfect, but it is fairly high. Interestingly, residual.sugar is not correlated with quality. That means a sweeter wine is not necessarily a better-tasting wine.
# Residual sugar vs. quality, three views:
#   1. raw scatter of all wines;
#   2. jittered scatter (quality is an integer score, so points overplot);
#   3. jittered scatter with a linear fit, after dropping wines above the
#      99th percentile of residual.sugar as outliers (top 1% of the data).
ggplot(wdf, aes(x = residual.sugar, y = quality)) +
  geom_point()
ggplot(aes(x = residual.sugar, y = quality), data = wdf) +
  geom_jitter(alpha = 0.33)
ggplot(aes(x = residual.sugar, y = quality),
       data = subset(wdf, residual.sugar > 0 &
                       residual.sugar <= quantile(wdf$residual.sugar, 0.99))) +
  geom_jitter(alpha = 0.33) +
  # TRUE rather than T: T is an ordinary variable that can be reassigned.
  geom_smooth(method = 'lm', se = TRUE, color = 'blue')
Residual.sugar shows no trend against quality. It turns out that sugar and quality have little relationship.
# Linear fit of quality on residual sugar. Wines above the 99th percentile of
# residual.sugar are treated as outliers and left out of the fit.
sugarAndQuality <- lm(quality ~ residual.sugar,
data=subset(wdf, residual.sugar > 0 &
residual.sugar <= quantile(wdf$residual.sugar, 0.99)))
# Print coefficients, residual summary and R-squared for the fit.
summary(sugarAndQuality)
##
## Call:
## lm(formula = quality ~ residual.sugar, data = subset(wdf, residual.sugar >
## 0 & residual.sugar <= quantile(wdf$residual.sugar, 0.99)))
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.6743 -0.6319 0.3560 0.3717 2.3778
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.60528 0.05277 106.220 <2e-16 ***
## residual.sugar 0.01211 0.01993 0.608 0.544
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.809 on 1581 degrees of freedom
## Multiple R-squared: 0.0002335, Adjusted R-squared: -0.0003989
## F-statistic: 0.3692 on 1 and 1581 DF, p-value: 0.5435
R^2 describes that sugar and quality have almost zero relationship.
# Alcohol vs. quality: a raw scatter of all wines, then a jittered scatter
# with a linear fit. For the second plot, wines above the 99th percentile of
# alcohol are dropped as outliers (top 1% of the data).
ggplot(wdf, aes(x = alcohol, y = quality)) +
  geom_point()
ggplot(aes(x = alcohol, y = quality),
       data = subset(wdf, alcohol > 0 &
                       alcohol <= quantile(wdf$alcohol, 0.99))) +
  geom_jitter(alpha = 0.33) +
  # TRUE rather than T: T is an ordinary variable that can be reassigned.
  geom_smooth(method = 'lm', se = TRUE, color = 'blue')
This plot shows a tendency: the more alcohol, the higher the quality score.
# Linear fit of quality on alcohol. Wines above the 99th percentile of
# alcohol are treated as outliers and left out of the fit.
alcholAndQuality <- lm(quality ~ alcohol,
data=subset(wdf, alcohol > 0 &
alcohol <= quantile(wdf$alcohol, 0.99)))
# Print coefficients, residual summary and R-squared for the fit.
summary(alcholAndQuality)
##
## Call:
## lm(formula = quality ~ alcohol, data = subset(wdf, alcohol >
## 0 & alcohol <= quantile(wdf$alcohol, 0.99)))
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.8535 -0.4077 -0.1848 0.5180 2.5923
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.76698 0.18234 9.691 <2e-16 ***
## alcohol 0.37150 0.01746 21.275 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.708 on 1583 degrees of freedom
## Multiple R-squared: 0.2224, Adjusted R-squared: 0.2219
## F-statistic: 452.6 on 1 and 1583 DF, p-value: < 2.2e-16
R^2 value is 0.22, that means alcohol explains about 22% of the wine quality.
# Scatterplot matrix of the acidity- and sulfur-related variables against
# pH and quality.
pairs.panels(
  wdf[, c("fixed.acidity", "citric.acid", "chlorides", "free.sulfur.dioxide",
          "total.sulfur.dioxide", "pH", "sulphates", "quality")],
  pch = "."
)
Chemically, acid has low pH values. In this matrix plot, we can verify the fact. We can observe that fixed.acidity and citric.acid have negative correlation with pH.
# Citric acid vs. quality: jittered scatter of all wines with a linear fit.
ggplot(aes(x = citric.acid, y = quality), data = wdf) +
  geom_jitter(alpha = 0.5) +
  # TRUE rather than T: T is an ordinary variable that can be reassigned.
  geom_smooth(method = 'lm', se = TRUE, color = 'blue')
# Sulphates vs. quality: same plot, after dropping wines above the 99th
# percentile of sulphates as outliers (top 1% of the data).
ggplot(aes(x = sulphates, y = quality),
       data = subset(wdf, sulphates > 0 &
                       sulphates <= quantile(wdf$sulphates, 0.99))) +
  geom_jitter(alpha = 0.5) +
  geom_smooth(method = 'lm', se = TRUE, color = 'blue')
In addition, sulphates and citric.acid also have relatively high correlations with quality.
# Linear fit of quality on citric acid. Wines above the 99th percentile of
# citric.acid are treated as outliers and left out of the fit.
# NOTE(review): the `citric.acid > 0` condition also excludes every wine whose
# citric.acid is exactly 0 (132 rows per the table() output earlier in this
# file), so far more than the top 1% is filtered here - confirm this is intended.
citricAndQuality <- lm(quality ~ citric.acid,
data=subset(wdf, citric.acid > 0 &
citric.acid <= quantile(wdf$citric.acid, 0.99)))
# Print coefficients, residual summary and R-squared for the fit.
summary(citricAndQuality)
##
## Call:
## lm(formula = quality ~ citric.acid, data = subset(wdf, citric.acid >
## 0 & citric.acid <= quantile(wdf$citric.acid, 0.99)))
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.9979 -0.6018 0.1152 0.4642 2.5962
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.37549 0.03904 137.697 < 2e-16 ***
## citric.acid 0.94312 0.11449 8.238 3.88e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7807 on 1449 degrees of freedom
## Multiple R-squared: 0.04474, Adjusted R-squared: 0.04408
## F-statistic: 67.86 on 1 and 1449 DF, p-value: 3.877e-16
# Linear fit of quality on sulphates. Wines above the 99th percentile of
# sulphates are treated as outliers and left out of the fit.
sulphatesAndQuality <- lm(quality ~ sulphates,
data=subset(wdf, sulphates > 0 &
sulphates <= quantile(wdf$sulphates, 0.99)))
# Print coefficients, residual summary and R-squared for the fit.
summary(sulphatesAndQuality)
##
## Call:
## lm(formula = quality ~ sulphates, data = subset(wdf, sulphates >
## 0 & sulphates <= quantile(wdf$sulphates, 0.99)))
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.02595 -0.51097 -0.02595 0.47064 2.39707
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.44423 0.09018 49.28 <2e-16 ***
## sulphates 1.83920 0.13573 13.55 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7653 on 1581 degrees of freedom
## Multiple R-squared: 0.1041, Adjusted R-squared: 0.1035
## F-statistic: 183.6 on 1 and 1581 DF, p-value: < 2.2e-16
However, their R^2 values show that they explain little of the variation in quality.
In the alcohol case, the higher its value, the better the wine quality tends to be. The residual.sugar case, however, was surprising. At first, I thought a sweet flavor could be a main factor in high-quality wine, but it turned out that the two are essentially unrelated.
pH is correlated with fixed.acidity and citric.acid, and it makes sense by chemical principle. They correlated with each other, that is, they can be repeated information.
The strongest relationship is with alcohol. It appears to affect the quality value according to the statistical analysis. The others, including sugar, citric acid, and sulphates, are too weak to say much about their relation with quality.
I converted quality to categorical features, taste(3-levels). - NG : quality 3,4 - GD : quality 5,6 - EX : quality 7,8
And, I just converted quality to factor variable, taste.detail.
These plots show the following:
# Boxplots of the four candidate predictors, first by 3-level taste grade...
ggplot(wdf, aes(x = taste, y = alcohol)) +
  geom_boxplot()
ggplot(wdf, aes(x = taste, y = residual.sugar)) +
  geom_boxplot()
ggplot(wdf, aes(x = taste, y = citric.acid)) +
  geom_boxplot()
ggplot(wdf, aes(x = taste, y = sulphates)) +
  geom_boxplot()
# ...then by the raw quality score as a factor.
ggplot(wdf, aes(x = taste.detail, y = alcohol)) +
  geom_boxplot()
ggplot(wdf, aes(x = taste.detail, y = residual.sugar)) +
  geom_boxplot()
ggplot(wdf, aes(x = taste.detail, y = citric.acid)) +
  geom_boxplot()
ggplot(wdf, aes(x = taste.detail, y = sulphates)) +
  geom_boxplot()
In this time, I tried to show boxplot for conveying same trend as the previous.
# Alcohol vs. citric acid for all wines, coloured by taste grade.
ggplot(wdf, aes(x = alcohol, y = citric.acid, color = taste)) +
  geom_point()
Nothing stands out in the plot above. I want to compare only NG and EX, not GD. GD cases are too many.
# Same scatter, keeping only the two extreme grades (NG and EX).
ggplot(subset(wdf, taste != "GD"),
       aes(x = alcohol, y = citric.acid, color = taste)) +
  geom_point()
The two groups can be roughly separated along a diagonal. This suggests again that alcohol and citric.acid have a positive relationship with wine quality.
# Nested linear models for quality on the full data set: alcohol only (m1),
# then adding sulphates (m2), then adding citric.acid (m3).
# NOTE(review): the I() wrappers look unnecessary for plain variables, but
# they appear in the printed calls, so the code is left as written.
m1 <- lm(I(quality) ~ I(alcohol), data = wdf)
m2 <- update(m1, ~ . + sulphates)
m3 <- update(m2, ~ . + citric.acid)
# Print the three fits side by side for comparison.
mtable(m1, m2, m3)
##
## Calls:
## m1: lm(formula = I(quality) ~ I(alcohol), data = wdf)
## m2: lm(formula = I(quality) ~ I(alcohol) + sulphates, data = wdf)
## m3: lm(formula = I(quality) ~ I(alcohol) + sulphates + citric.acid,
## data = wdf)
##
## =============================================
## m1 m2 m3
## ---------------------------------------------
## (Intercept) 1.875*** 1.375*** 1.434***
## (0.175) (0.177) (0.176)
## I(alcohol) 0.361*** 0.346*** 0.338***
## (0.017) (0.016) (0.016)
## sulphates 0.994*** 0.814***
## (0.102) (0.107)
## citric.acid 0.513***
## (0.093)
## ---------------------------------------------
## R-squared 0.227 0.270 0.284
## adj. R-squared 0.226 0.269 0.282
## sigma 0.710 0.690 0.684
## F 468.267 294.988 210.501
## p 0.000 0.000 0.000
## Log-likelihood -1721.057 -1675.142 -1659.955
## Deviance 805.870 760.894 746.576
## AIC 3448.114 3358.284 3329.910
## BIC 3464.245 3379.793 3356.795
## N 1599 1599 1599
## =============================================
As we added the main features to the linear model, the R^2 value increased (0.227, 0.270, 0.284).
The relationship between Red wine quality and alcohol, citric.acid, can be shown again by using categorical variables. It is more intuitive than just numerical values.
As I investigated, alcohol and citric acid are the main factors in red wine quality.
Visualizing only the two extreme grades (NG and EX) was very successful. Particularly when there are too many samples, comparing the two extremes is a good idea.
When someone tries a wine, he or she has an instinct feeling one of 3 types(“not good”, “good”, or “excellent”.) Therefore, I added new variable ‘taste’ for intuitive analysis and visual simplicity.
I had reasoned that a sweet wine would tend to get an excellent grade, because people love sweet drinks. However, that turned out to be wrong. Sugar is not a crucial factor in determining a red wine’s quality. Every grade has almost the same amount of residual sugar.
Alcohol is the main reason people drink red wine or other liquors, so I suspected it would be a main feature of high-quality wine. In addition, potassium sulphate is a distinctive ingredient in red wine, so it may be one of the reasons people around the world enjoy it. As I expected, “EX”-graded red wine tends to have more alcohol than the others. Sulphates also tend to increase as the 3-level taste grade gets higher.
The red wine data was very tidy, so it was very convenient to handle. Every feature is numerical, so I needed to convert some numerical variables to categorical variables, e.g., taste and taste.detail.
It turned out that alcohol and sulphates are the main factors in determining a high wine grade, based on statistical linear modeling. However, their R-squared values are not large.
For future work, we can have chances to apply other nonlinear statistic models. It will lead us to a new aspect to analyze this red wine data.